In [1]:
# Test fixture location. The directory and the file name are kept as separate
# variables; docxfile is their plain concatenation (the directory string
# already ends with a path separator).
test_data_path = "/Users/wbrierley/Documents/Jupyter Notebooks/Document Parsing Data/"
test_filename = "Brierley Bill Full CV 20180812.docx"
docxfile = "".join((test_data_path, test_filename))
In [2]:
import xml.etree.ElementTree as ET
import zipfile
In [11]:
# Open the .docx file as a ZIP archive for reading. The handle is left open on
# purpose: a later cell reads a member through `mydocx`.
mydocx = zipfile.ZipFile(docxfile, mode="r")
# Bare last expression so the notebook displays the list of archive members.
mydocx.namelist()
Out[11]:
In [18]:
# Parse the '[Content_Types].xml' member of the archive and print its root
# element. ZipFile.open() returns a binary file-like object that ET.parse()
# can consume directly; the raw bytes returned by .read() have no getroot().
with mydocx.open('[Content_Types].xml') as component:
    print(ET.parse(component).getroot())
In [ ]:
import xml.etree.ElementTree as ET
In [ ]:
def opendocx(file):
    '''Open a docx file, return a document XML tree.

    Parameters:
        file: path to the .docx file, or a binary file-like object
            containing it (anything zipfile.ZipFile accepts).

    Returns:
        xml.etree.ElementTree.ElementTree parsed from the archive member
        'word/document.xml'; call .getroot() on it for the root element.

    Raises:
        zipfile.BadZipFile: if `file` is not a valid ZIP archive.
        KeyError: if the archive has no 'word/document.xml' member.
        xml.etree.ElementTree.ParseError: if that member is not well-formed XML.
    '''
    # NOTE(review): 'word/document.xml' is the conventional location of the
    # main document part; confirm it holds for every input you care about.
    # The context managers close both the member stream and the archive once
    # parsing has finished; the returned tree is fully in memory by then.
    with zipfile.ZipFile(file) as archive:
        with archive.open('word/document.xml') as part:
            return ET.parse(part)
In [ ]:
# Keep the root element of the parsed document tree for later cells.
# NOTE(review): `doc` is not assigned in any cell visible here — presumably it
# should be the tree returned by opendocx(docxfile); confirm and define it
# above so this cell works after Restart & Run All.
doc_root = doc.getroot()
In [ ]: